In [555]:
# <div class="alert alert-block alert-danger">
# <link rel="icon" href="/static/img/favicon-bee.ico">
# </div>

from IPython.display import HTML, display
display(HTML('<link rel="icon" href="/static/img/favicon-bee.ico">'))
In [554]:
import plotly.io as pio

# This ensures Plotly output works in multiple places:
# plotly_mimetype: VS Code notebook UI
# notebook: "Jupyter: Export to HTML" command in VS Code
# See https://plotly.com/python/renderers/#multiple-renderers
pio.renderers.default = "plotly_mimetype+notebook"
In [ ]:
'''add this to the html file 
<link rel="icon" href="/static/img/favicon-bee.ico">
'''

Airbnb prices in European cities¶

Aim of this analysis:¶

  • Data prep

    • add city field
    • add weekend/weekday flag
    • combine all files
  • EDA

    • distribution of prices by city
    • distribution of room type
  • Regression modellng to estimate how different attributes affect listing price

  • Identify spatial clusters of listings

  • Plot clusters on a map

  • Does being close to the cluster centre affect prices?

  • Is there a superhost premium? (Do superhosts charge more?)

  • Are listing prices more expensive on weekends? To what extent? Do we observe the same in all cities?

    • logistic re
    • propensity score matching between a weekday and weekend listing

About the data¶

I used a data set that contains Airbnb listings in 10 European cities, with the following attributes available for each listing:

  • realSum (the total price of the listing)
  • room_type (private/shared/entire home/apt)
  • host_is_superhost (boolean value indicating if host is a superhost or not)
  • multi (indicator whether listing is for multiple rooms or not)
  • biz (business indicator)
  • guest_satisfaction_overall (overall rating from guests camparing all listings offered by host)
  • bedrooms, dist (distance from city center)
  • lng & lat coordinates for location identification etc.

Data source: Kaggle https://www.kaggle.com/datasets/thedevastator/airbnb-prices-in-european-cities

In [528]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
from scipy.stats.stats import pearsonr

Data preparation¶

Import data

In [529]:
cities_list = ['amsterdam', 'athens', 'barcelona', 'berlin', 'budapest', 
            'lisbon', 'london', 'paris', 'rome', 'vienna']
In [530]:
weekdays_dict = {}
weekends_dict = {}

for city in cities_list:
    df_weekdays = pd.read_csv(f'data/{city}_weekdays.csv')
    df_weekends = pd.read_csv(f'data/{city}_weekends.csv')
    
    # city variable
    df_weekdays['city'] = city
    df_weekends['city'] = city
    
    # dummy variable for weekend
    df_weekdays['is_weekend'] = 0
    df_weekends['is_weekend'] = 1
    
    df_weekdays['week_part'] = 'weekday'
    df_weekends['week_part'] = 'weekend'
    
    weekdays_dict[city] = df_weekdays
    weekends_dict[city] = df_weekends
    print(city, f'weekdays: {len(df_weekdays)}, weekends: {len(df_weekends)}')  
amsterdam weekdays: 1103, weekends: 977
athens weekdays: 2653, weekends: 2627
barcelona weekdays: 1555, weekends: 1278
berlin weekdays: 1284, weekends: 1200
budapest weekdays: 2074, weekends: 1948
lisbon weekdays: 2857, weekends: 2906
london weekdays: 4614, weekends: 5379
paris weekdays: 3130, weekends: 3558
rome weekdays: 4492, weekends: 4535
vienna weekdays: 1738, weekends: 1799

Combine into one dataframe

In [531]:
df = pd.concat(list(weekdays_dict.values())+list(weekends_dict.values())).drop(columns=['Unnamed: 0'])
In [532]:
df['price_per_night'] = df['realSum']/2

Exploratory data analysis¶

averages

In [533]:
df_ = df.groupby('city').agg({'price_per_night':'median', 'room_shared':'mean', 'bedrooms':'mean', 'person_capacity':'mean', 
                              'host_is_superhost':'mean', 'multi':'mean', 'biz':'mean', 
                              'guest_satisfaction_overall':'mean', 'cleanliness_rating':'mean', 
                              'dist':'mean', 'metro_dist':'mean', 
                             'attr_index_norm':'mean', 'rest_index_norm':'mean'}).reset_index().sort_values('price_per_night')
df_
Out[533]:
city price_per_night room_shared bedrooms person_capacity host_is_superhost multi biz guest_satisfaction_overall cleanliness_rating dist metro_dist attr_index_norm rest_index_norm
1 athens 63.857709 0.002083 1.271402 3.698106 0.428598 0.267424 0.379924 95.003598 9.638447 1.803080 0.478656 5.740839 9.954268
4 budapest 76.491047 0.003481 1.105669 3.540776 0.378916 0.303332 0.348831 94.585281 9.477374 1.872763 0.544059 12.675248 34.529089
8 rome 91.295911 0.001329 1.229755 3.357372 0.326687 0.385953 0.334109 93.122300 9.514678 3.026982 0.819794 10.426968 25.078056
3 berlin 95.587548 0.029388 1.070451 2.774960 0.257246 0.276167 0.174718 94.323671 9.461755 5.257093 0.836064 16.803111 30.666967
2 barcelona 104.149696 0.004236 1.161313 2.616661 0.181433 0.385104 0.325450 91.109072 9.291564 2.116982 0.441248 16.636220 19.376528
9 vienna 104.247014 0.004524 1.102347 3.350297 0.284139 0.279050 0.339836 93.731128 9.472434 3.139488 0.526670 8.762474 4.239580
5 lisbon 112.687617 0.012841 1.272428 3.343398 0.213951 0.239459 0.587541 91.093875 9.370640 1.966893 0.711482 7.324730 28.274084
6 london 130.647475 0.005004 1.128790 2.846192 0.157410 0.274992 0.387872 90.645652 9.175023 5.326421 1.005547 20.537398 11.234105
7 paris 158.798583 0.014055 0.972787 2.953648 0.140700 0.219498 0.245813 92.037530 9.263606 2.995823 0.227323 18.204358 42.589111
0 amsterdam 230.122091 0.004808 1.292308 2.781731 0.284135 0.283173 0.105288 94.514423 9.465865 2.825052 1.089367 14.246499 26.097566

standard deviation

In [534]:
df_ = df.groupby('city').agg({'price_per_night':'std', 'room_shared':'std', 
                              'bedrooms':'std', 'person_capacity':'std', 
                              'host_is_superhost':'std',  'multi':'std', 'biz':'std',
                              'guest_satisfaction_overall':'std', 'cleanliness_rating':'std', 
                              'dist':'std', 'metro_dist':'std',
                              'attr_index_norm':'std', 'rest_index_norm':'std'}).reset_index().sort_values('price_per_night')
df_
Out[534]:
city price_per_night room_shared bedrooms person_capacity host_is_superhost multi biz guest_satisfaction_overall cleanliness_rating dist metro_dist attr_index_norm rest_index_norm
5 lisbon 54.486540 0.112596 0.728539 1.344214 0.410128 0.426790 0.492320 9.148114 0.924080 1.742681 0.920204 5.082390 17.877309
8 rome 59.309052 0.036438 0.549710 1.309052 0.469028 0.486847 0.471704 7.815107 0.808415 1.644095 0.631361 6.631054 13.414188
4 budapest 65.572403 0.058903 0.663484 1.256548 0.485177 0.459754 0.476660 6.525680 0.842693 1.874925 0.856410 6.672535 19.111435
3 berlin 117.664645 0.168926 0.552033 1.188142 0.437204 0.447191 0.379802 6.809406 0.849384 3.692649 1.267283 10.774273 16.634505
1 athens 132.940027 0.045600 0.652575 1.284703 0.494922 0.442657 0.485414 8.348637 0.839767 0.953738 0.284154 4.667181 10.778060
7 paris 165.474872 0.117727 0.642571 1.215007 0.347738 0.413937 0.430601 8.818201 0.974036 1.463542 0.122769 7.759372 15.680438
2 barcelona 177.733944 0.064956 0.517108 1.153124 0.385445 0.486706 0.468625 8.607153 1.014577 1.377859 0.284540 9.591798 10.256285
9 vienna 198.873582 0.067115 0.602819 1.282163 0.451067 0.448596 0.473720 7.220808 0.855439 1.942337 0.516132 6.259531 3.683256
0 amsterdam 215.329203 0.069187 0.736683 1.032634 0.451110 0.450648 0.306999 6.350874 0.813421 2.082573 0.831669 10.335158 17.720931
6 london 235.678632 0.070562 0.579477 1.246235 0.364205 0.446533 0.487289 11.510622 1.166180 2.712573 1.263926 11.914575 6.963803
In [535]:
metric_list = ['price_per_night','room_shared','bedrooms','person_capacity',
               'host_is_superhost','multi', 'biz', 'guest_satisfaction_overall','cleanliness_rating',
               'dist', 'metro_dist',  'attr_index_norm', 'rest_index_norm']

highest = []
lowest = []
for metric in metric_list:
    highest.append(df_.sort_values(metric)['city'].values[-1])
    lowest.append(df_.sort_values(metric)['city'].values[0])
    
pd.DataFrame({'metric':metric_list, 'highest':highest, 'lowest':lowest})
Out[535]:
metric highest lowest
0 price_per_night london lisbon
1 room_shared berlin rome
2 bedrooms amsterdam barcelona
3 person_capacity lisbon amsterdam
4 host_is_superhost athens paris
5 multi rome paris
6 biz lisbon amsterdam
7 guest_satisfaction_overall london amsterdam
8 cleanliness_rating london rome
9 dist berlin athens
10 metro_dist berlin paris
11 attr_index_norm london athens
12 rest_index_norm budapest vienna
In [536]:
df['host_has_multiple_listings'] = 'single listing'
df.loc[(df['multi']==1)|(df['biz']==1), 'host_has_multiple_listings'] = 'multiple listings'
df['host_is_biz'] = 'Private'
df.loc[df['multi']==1, 'host_is_biz'] = 'Business'

Table above shows that
Athens have the best guest satisfaction and cleanliness ratings, and also the cheapest despite having the highest person capacity.
Amsterdam is the most expensive, presumably because it has the highest average number of bedrooms.
London has the best attraction index but worst guest satisfaction and cleanliness ratings.

In [537]:
fig = go.Figure()

for week_part in ['weekday', 'weekend']:
    fig.add_trace(go.Box(y=df[df['week_part']==week_part]['price_per_night'], 
                         x=df[df['week_part']==week_part]['city'],
                         #boxpoints=False, 
                         name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')

fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    title='Box plot of listing prices<sup><br>Listing prices on weekdays and weekends are similar, except in Amsterdam where prices are higher on weekends. </sup>', 
    plot_bgcolor='rgba(0, 0, 0, 0)',
    yaxis=dict(title='two nights for two (euros)', 
               range=[0,800], 
               tickformat=',.0f'),
                  

)    
fig.show()

# df_ = df.groupby(['city', 'week_part']).agg({'person_capacity':'mean'}).reset_index()

# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
#     fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['person_capacity'], 
#                          x=df_[df_['week_part']==week_part]['city'],
#                          text=df_[df_['week_part']==week_part]['person_capacity'],
#                          textposition='outside',
#                          texttemplate='%{text:.1f}',
#                          #boxpoints=False, 
#                          name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')

# fig.update_layout(
#     boxmode='group', title='Average person capacity', plot_bgcolor='rgba(0, 0, 0, 0)',
#     yaxis=dict(title='personal capacity (mean)', tickformat=',.1f', range=[2,4]),
#     height=400
                  

# )    
# fig.show()

# df_ = df.groupby(['city', 'week_part']).agg({'guest_satisfaction_overall':'mean'}).reset_index()

# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
#     fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['guest_satisfaction_overall'], 
#                          x=df_[df_['week_part']==week_part]['city'],
#                          text=df_[df_['week_part']==week_part]['guest_satisfaction_overall'],
#                          textposition='outside',
#                          texttemplate='%{text:.1f}',
#                          #boxpoints=False, 
#                          name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')

# fig.update_layout(
#     boxmode='group', title='Guest satisfaction rating', plot_bgcolor='rgba(0, 0, 0, 0)',
#     yaxis=dict(title='personal capacity (mean)', tickformat=',.0f', range=[90,100]
#               ),
#     height=400
                  

# )    
# fig.show()


# df_ = df.groupby(['city', 'week_part']).agg({'host_is_superhost':'mean'}).reset_index()

# fig = go.Figure()
# for week_part in ['weekday', 'weekend']:
#     fig.add_trace(go.Bar(y=df_[df_['week_part']==week_part]['host_is_superhost'], 
#                          x=df_[df_['week_part']==week_part]['city'],
#                          text=df_[df_['week_part']==week_part]['host_is_superhost'],
#                          textposition='outside',
#                          texttemplate='%{text:.1f}',
#                          #boxpoints=False, 
#                          name=week_part))# = px.box(df, x='city', y='realSum', color='week_part')

# fig.update_layout(
#     boxmode='group', title='Distance from metro', plot_bgcolor='rgba(0, 0, 0, 0)',
#     yaxis=dict(title='metro distance', tickformat=',.1f',# range=[90,100]
#               ),
#     height=400
                  

# )    
# fig.show()
In [538]:
fig = go.Figure()

for biz_status in ['Private', 'Business']:
    fig.add_trace(go.Box(y=df[df['host_is_biz']==biz_status]['price_per_night'], 
                         x=df[df['host_is_biz']==biz_status]['city'],
                         #boxpoints=False, 
                         name=biz_status))# = px.box(df, x='city', y='realSum', color='week_part')

fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    title='Box plot of listing prices - private vs business<sup><br>Listing prices for business hosts are slightly lower than private, especially in Amsterdam. </sup>', 
    plot_bgcolor='rgba(0, 0, 0, 0)',
    yaxis=dict(title='two nights for two (euros)', 
               range=[0,800], 
               tickformat=',.0f'),
                  

)    
fig.show()
In [539]:
fig = go.Figure()

for biz_status in ['single listing', 'multiple listings']:
    fig.add_trace(go.Box(y=df[df['host_has_multiple_listings']==biz_status]['price_per_night'], 
                         x=df[df['host_has_multiple_listings']==biz_status]['city'],
                         #boxpoints=False, 
                         name=biz_status))# = px.box(df, x='city', y='realSum', color='week_part')

fig.update_layout(
    boxmode='group',
    font_color="black",
    title_font_color="black",
    title='Box plot of listing prices - single vs multiple listings<sup><br>Listing prices for business hosts are slightly lower than private, especially in Amsterdam. </sup>', 
    plot_bgcolor='rgba(0, 0, 0, 0)',
    yaxis=dict(title='two nights for two (euros)', 
               range=[0,800], 
               tickformat=',.0f'),
                  

)    
fig.show()

Correlation analysis¶

Quick analysis of the Pearson correlation coefficients* between variables

* Pearson correlation coefficient is the ratio between the covariance of the two variables and the product of their standard deviations. So it is essentially a normalized measurement of covariance.

In [540]:
vars_of_interest = ['price_per_night','room_shared','bedrooms','person_capacity',
                    'host_is_superhost', 'multi', 'biz',
                    'guest_satisfaction_overall','cleanliness_rating',
                    'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm']
df_corr = df[vars_of_interest].corr().round(2)

mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_vis = df_corr.mask(mask)

fig = px.imshow(df_corr_vis, text_auto=True, 
               color_continuous_scale=px.colors.diverging.PRGn, 
               color_continuous_midpoint=0)
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  title='Pearson correlation coefficients',
                  xaxis=dict(tickangle=90), 
                  height=600
                 )

fig.show()
In [541]:
# just london
df_corr = df[df['city']=='london'][vars_of_interest].corr().round(2)

mask = np.triu(np.ones_like(df_corr, dtype=bool))
df_corr_vis = df_corr.mask(mask)

fig = px.imshow(df_corr_vis, text_auto=True, 
               color_continuous_scale=px.colors.diverging.PRGn, 
               color_continuous_midpoint=0)
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  title='Pearson correlation coefficients',
                  xaxis=dict(tickangle=90), 
                  height=600
                 )

fig.show()

The correlation matrix above shows the correlation between variables.
Unsurprisingly, the following variables are positively correlated:

  • guest satisfaction score & cleanliness rating & being a super host
  • price per night & person capacity & number of bedrooms
  • distance from city centre & distance from metro

These correlations are statistically significant, as shown in p-value matrix below.

In [542]:
df_pvals = pd.DataFrame(index=vars_of_interest)
for col_metric in vars_of_interest:
    pvals_list = []
    for row_metric in vars_of_interest:
        pvals_list.append(pearsonr(df[col_metric],df[row_metric])[1].round(2))
    df_pvals[col_metric] = pvals_list
In [543]:
df_pvals
df_pvals_vis = df_pvals.mask(mask)

fig = px.imshow(df_pvals_vis, text_auto=True, 
               color_continuous_scale=px.colors.sequential.Purples_r, 
               color_continuous_midpoint=0.1
               )
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)',
                  title='''p-values of the Pearson correlation coefficients
                  <br><sup>(zeros mean that the correlations is so statistically significant that the p-value is smaller than the smallest possible floating point)</sup>''',
                  xaxis=dict(tickangle=90), 
                  height=600
                 )

fig.show()

Fitting OLS trendlines for key variables by city¶

The factors influecing listing prices can differ across cities. For example:

  • While distance negatively correlated with prices for all cities, it is particularly strong for Amsterdam
  • Guest satisfaction is correlated with prices only in Amsterdam, Athens, Lisbon, Paris, and Rome.
In [544]:
fig = px.scatter(df, 
                x='dist', 
                y='price_per_night',
                hover_data=['price_per_night', 'dist'],
                color='city',
                facet_col='city', 
                facet_col_wrap=5,
                title='Distance from centre vs price',
                trendline='ols',
                height=600, width=1000
                #barmode='group'
                )

fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)', 
    showlegend=False)

trendlines = px.get_trendline_results(fig).set_index('city')

# remove annotation text (added below)
fig.for_each_annotation(lambda a: a.update(text=''))#f'{a.text.split("=")[-1]}'))

for j, row in enumerate(fig._grid_ref):
    for i, col in enumerate(row):
        k = j*5+i
        city = fig['data'][k*2]['name']
        xaxis = fig['data'][k*2]['xaxis']
        yaxis = fig['data'][k*2]['yaxis']
        x_max = np.max(fig['data'][k*2]['x'])
        intercept = trendlines.loc[city]['px_fit_results'].params[0]
        slope = trendlines.loc[city]['px_fit_results'].params[1]
        r2 = trendlines.loc[city]['px_fit_results'].rsquared
        intercept_pval = trendlines.loc[city]['px_fit_results'].pvalues[0]
        slope_pval = trendlines.loc[city]['px_fit_results'].pvalues[1]
        fig.add_annotation(
                           xref=xaxis, x=x_max, 
                            yref=yaxis, y=700,
                           text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}', 
                            showarrow=False,
                            align='right'
        )
        
                
fig.update_xaxes(matches=None, tickformat='.0f', #title='distance from centre'
                )
fig.update_yaxes(matches=None, tickformat='.0f', range=[0,800], #title='price per night (euros)'
                )
fig.show()
In [545]:
fig = px.scatter(df,#[df['person_capacity']==2], 
                x='guest_satisfaction_overall', 
                y='price_per_night',
                hover_data=['price_per_night', 'guest_satisfaction_overall'],
                color='city',
                facet_col='city', 
                facet_col_wrap=5,
                title='Guest satisfaction vs price',
                trendline='ols',
                height=550, width=1000
                #barmode='group'
                )

fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)', 
    showlegend=False)

trendlines = px.get_trendline_results(fig).set_index('city')

# remove annotation text (added below)
fig.for_each_annotation(lambda a: a.update(text=''))#f'{a.text.split("=")[-1]}'))

for j, row in enumerate(fig._grid_ref):
    for i, col in enumerate(row):
        k = j*5+i
        city = fig['data'][k*2]['name']
        xaxis = fig['data'][k*2]['xaxis']
        yaxis = fig['data'][k*2]['yaxis']
        x_max = np.max(fig['data'][k*2]['x'])
        intercept = trendlines.loc[city]['px_fit_results'].params[0]
        slope = trendlines.loc[city]['px_fit_results'].params[1]
        r2 = trendlines.loc[city]['px_fit_results'].rsquared
        intercept_pval = trendlines.loc[city]['px_fit_results'].pvalues[0]
        slope_pval = trendlines.loc[city]['px_fit_results'].pvalues[1]
        fig.add_annotation(
                           xref=xaxis, x=0.5, 
                            yref=yaxis, y=700,
                           text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}', 
                            showarrow=False,
                            align='right'
        )

fig.add_annotation(
   xref='paper', x=.5, 
    yref='paper', y=-.1,
   text=f'guest satisfaction', 
    showarrow=False,
    align='right'
)
                
fig.update_xaxes(matches=None, tickformat='.0f', title=''
                )
fig.update_yaxes(matches=None, tickformat='.0f', range=[0,800] #title='price per night (euros)'
                )
fig.show()
In [546]:
fig = px.scatter(#df[df['person_capacity']==2], 
                df,
                x='metro_dist', 
                y='guest_satisfaction_overall',
                hover_data=['guest_satisfaction_overall', 'metro_dist'],
                color='city',
                facet_col='city', 
                facet_col_wrap=5,
                title='Distance from metro vs guest satisfaction',
                trendline='ols',
                height=600, width=1000
                #barmode='group'
                )

fig.update_layout(
    font_color="black",
    title_font_color="black",
    plot_bgcolor='rgba(0, 0, 0, 0)', 
    showlegend=False)

trendlines = px.get_trendline_results(fig).set_index('city')

# remove annotation text (added below)
fig.for_each_annotation(lambda a: a.update(text=''))#f'{a.text.split("=")[-1]}'))

for j, row in enumerate(fig._grid_ref):
    for i, col in enumerate(row):
        k = j*5+i
        city = fig['data'][k*2]['name']
        xaxis = fig['data'][k*2]['xaxis']
        yaxis = fig['data'][k*2]['yaxis']
        x_max = np.max(fig['data'][k*2]['x'])
        intercept = trendlines.loc[city]['px_fit_results'].params[0]
        slope = trendlines.loc[city]['px_fit_results'].params[1]
        r2 = trendlines.loc[city]['px_fit_results'].rsquared
        intercept_pval = trendlines.loc[city]['px_fit_results'].pvalues[0]
        slope_pval = trendlines.loc[city]['px_fit_results'].pvalues[1]
        fig.add_annotation(
                           xref=xaxis, x=x_max, 
                            yref=yaxis, y=2,
                           text=f'{city}<br>slope: {slope:.2f}<br>(p-val {slope_pval:.2f})<br>r2: {r2: .2f}', 
                            showarrow=False,
                            align='right'
        )
        
                
fig.update_xaxes(matches=None, tickformat='.0f', #title='distance from centre'
                )
fig.update_yaxes(matches=None, tickformat='.0f',# range=[0,800] #title='price per night (euros)'
                )
fig.show()
In [547]:
results = px.get_trendline_results(fig)
In [548]:
trendlines.loc['amsterdam']['px_fit_results'].summary()
Out[548]:
OLS Regression Results
Dep. Variable: y R-squared: 0.001
Model: OLS Adj. R-squared: 0.001
Method: Least Squares F-statistic: 2.370
Date: Tue, 14 Mar 2023 Prob (F-statistic): 0.124
Time: 09:50:02 Log-Likelihood: -6794.8
No. Observations: 2080 AIC: 1.359e+04
Df Residuals: 2078 BIC: 1.360e+04
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 94.2337 0.229 410.713 0.000 93.784 94.684
x1 0.2577 0.167 1.539 0.124 -0.071 0.586
Omnibus: 1818.334 Durbin-Watson: 1.265
Prob(Omnibus): 0.000 Jarque-Bera (JB): 93279.298
Skew: -3.872 Prob(JB): 0.00
Kurtosis: 34.880 Cond. No. 3.14


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
In [549]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 51707 entries, 0 to 1798
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   realSum                     51707 non-null  float64
 1   room_type                   51707 non-null  object 
 2   room_shared                 51707 non-null  bool   
 3   room_private                51707 non-null  bool   
 4   person_capacity             51707 non-null  float64
 5   host_is_superhost           51707 non-null  bool   
 6   multi                       51707 non-null  int64  
 7   biz                         51707 non-null  int64  
 8   cleanliness_rating          51707 non-null  float64
 9   guest_satisfaction_overall  51707 non-null  float64
 10  bedrooms                    51707 non-null  int64  
 11  dist                        51707 non-null  float64
 12  metro_dist                  51707 non-null  float64
 13  attr_index                  51707 non-null  float64
 14  attr_index_norm             51707 non-null  float64
 15  rest_index                  51707 non-null  float64
 16  rest_index_norm             51707 non-null  float64
 17  lng                         51707 non-null  float64
 18  lat                         51707 non-null  float64
 19  city                        51707 non-null  object 
 20  is_weekend                  51707 non-null  int64  
 21  week_part                   51707 non-null  object 
 22  price_per_night             51707 non-null  float64
 23  host_has_multiple_listings  51707 non-null  object 
 24  host_is_biz                 51707 non-null  object 
dtypes: bool(3), float64(13), int64(4), object(5)
memory usage: 9.2+ MB

Regression modelling¶

Feature engineering¶

In [550]:
# create dummy variables for cities
city_dummies = pd.get_dummies(df['city'], prefix='city')
# create dummy variables for room type
room_type_dummies = pd.get_dummies(df['room_type'], prefix='room_type')
In [551]:
df_all_features = pd.concat([df, city_dummies, room_type_dummies], axis=1)

for col in ['room_shared', 'host_is_superhost']:
    df_all_features[col] = 1*df_all_features[col]
    
x = df_all_features[[#'room_shared', 
                     'person_capacity', 'bedrooms', 'host_is_superhost', 'biz', 
       'cleanliness_rating', 'guest_satisfaction_overall', 
      'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm', 'is_weekend', 
      ]+city_dummies.columns.tolist()+room_type_dummies.columns.tolist()].drop(columns=['city_lisbon', 'room_type_Shared room'])

y =np.array(df_all_features['price_per_night']).astype(float)

x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
Out[551]:
OLS Regression Results
Dep. Variable: y R-squared: 0.241
Model: OLS Adj. R-squared: 0.241
Method: Least Squares F-statistic: 747.9
Date: Tue, 14 Mar 2023 Prob (F-statistic): 0.00
Time: 09:50:06 Log-Likelihood: -3.2991e+05
No. Observations: 51707 AIC: 6.599e+05
Df Residuals: 51684 BIC: 6.601e+05
Df Model: 22
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -149.2155 10.910 -13.677 0.000 -170.599 -127.832
person_capacity 12.1119 0.692 17.505 0.000 10.756 13.468
bedrooms 43.9391 1.256 34.996 0.000 41.478 46.400
host_is_superhost -0.4530 1.539 -0.294 0.769 -3.470 2.564
biz 13.9951 1.428 9.800 0.000 11.196 16.794
cleanliness_rating 3.1463 0.952 3.305 0.001 1.280 5.012
guest_satisfaction_overall 0.3462 0.103 3.346 0.001 0.143 0.549
dist -0.2372 0.489 -0.485 0.628 -1.196 0.722
metro_dist -4.4357 0.947 -4.685 0.000 -6.291 -2.580
attr_index_norm 3.2576 0.113 28.825 0.000 3.036 3.479
rest_index_norm 0.0013 0.066 0.020 0.984 -0.127 0.130
is_weekend 2.7205 1.262 2.156 0.031 0.248 5.193
city_amsterdam 165.3610 3.876 42.662 0.000 157.764 172.958
city_athens -57.4367 3.019 -19.027 0.000 -63.353 -51.520
city_barcelona 41.4598 3.688 11.242 0.000 34.231 48.688
city_berlin 12.9153 4.155 3.109 0.002 4.772 21.058
city_budapest -55.2634 3.050 -18.120 0.000 -61.241 -49.286
city_london 50.1832 3.756 13.361 0.000 42.821 57.545
city_paris 57.5275 3.156 18.229 0.000 51.342 63.713
city_rome -18.8338 2.552 -7.381 0.000 -23.835 -13.833
city_vienna -0.2127 3.507 -0.061 0.952 -7.086 6.660
room_type_Entire home/apt 100.5635 7.565 13.293 0.000 85.736 115.391
room_type_Private room 42.8076 7.644 5.600 0.000 27.826 57.789
Omnibus: 134661.840 Durbin-Watson: 1.908
Prob(Omnibus): 0.000 Jarque-Bera (JB): 4190421196.059
Skew: 30.271 Prob(JB): 0.00
Kurtosis: 1396.318 Cond. No. 2.17e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.17e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [552]:
df_all_features = pd.concat([df, city_dummies, room_type_dummies], axis=1)

for col in ['room_shared', 'host_is_superhost']:
    df_all_features[col] = 1*df_all_features[col]
    
x = df_all_features[df_all_features['city']=='london'][[#'room_shared', 
                                                        'person_capacity', 'bedrooms', 'host_is_superhost', 'biz', 
       'cleanliness_rating', 'guest_satisfaction_overall', 
      'dist', 'metro_dist', 'attr_index_norm', 'rest_index_norm', 'is_weekend', 
      ]+room_type_dummies.columns.tolist()].drop(columns=['room_type_Shared room'])

y =np.array(df_all_features[df_all_features['city']=='london']['price_per_night']).astype(float)

x = sm.add_constant(x)
model = sm.OLS(y, x).fit()
model.summary()
Out[552]:
OLS Regression Results
Dep. Variable: y R-squared: 0.231
Model: OLS Adj. R-squared: 0.230
Method: Least Squares F-statistic: 230.4
Date: Tue, 14 Mar 2023 Prob (F-statistic): 0.00
Time: 09:50:07 Log-Likelihood: -67454.
No. Observations: 9993 AIC: 1.349e+05
Df Residuals: 9979 BIC: 1.350e+05
Df Model: 13
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const -196.9080 37.294 -5.280 0.000 -270.012 -123.804
person_capacity 17.6280 2.470 7.137 0.000 12.786 22.470
bedrooms 85.6694 4.524 18.936 0.000 76.801 94.537
host_is_superhost 9.4253 5.916 1.593 0.111 -2.171 21.022
biz -7.8009 4.784 -1.631 0.103 -17.178 1.576
cleanliness_rating 1.1475 2.728 0.421 0.674 -4.200 6.495
guest_satisfaction_overall 0.5227 0.283 1.846 0.065 -0.032 1.078
dist 4.6404 1.660 2.795 0.005 1.386 7.895
metro_dist -11.1501 2.388 -4.669 0.000 -15.831 -6.469
attr_index_norm 4.6970 0.461 10.184 0.000 3.793 5.601
rest_index_norm 0.0653 0.702 0.093 0.926 -1.310 1.440
is_weekend -1.7089 4.163 -0.410 0.681 -9.870 6.452
room_type_Entire home/apt 121.7948 29.541 4.123 0.000 63.888 179.701
room_type_Private room 21.0085 29.439 0.714 0.475 -36.699 78.716
Omnibus: 22537.598 Durbin-Watson: 1.968
Prob(Omnibus): 0.000 Jarque-Bera (JB): 155015487.623
Skew: 21.373 Prob(JB): 0.00
Kurtosis: 611.663 Cond. No. 2.42e+03


Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.42e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
In [ ]: